### Reading Data
require('readr')
### Data Wrangling
require('dplyr')
require('tidyr')
require('tibble')
require('DMwR') ## KNN Imputation
### Visualization
require('ggplot2')
require('Amelia')
require('plotly')
# Load the training set and split it into identifiers, target and predictors.
train <- readr::read_csv("./data/train.csv")
IdTrain <- train$Id
YTrain <- train$Target
XTrain <- dplyr::select(train, -Id, -Target)

# Load the test set and split it into identifiers and predictors.
test <- readr::read_csv("./data/test.csv")
IdTest <- test$Id
XTest <- dplyr::select(test, -Id)

# Print a compact, column-by-column overview of the training predictors.
dplyr::glimpse(XTrain)
## Observations: 1,318
## Variables: 83
## $ V1 <int> 2, 3, 4, 3, 3, 3, 3, 4, 3, 5, 4, 2, 3, 2, 2, 3, 3, 3, 3, 3...
## $ V2 <int> 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 5, 5, 5...
## $ V3 <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5...
## $ V4 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA,...
## $ V5 <int> 303, 180, 621, 518, 840, 532, 260, 554, 328, 444, 506, 520...
## $ V6 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ V7 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ V8 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 5, 1, 1, 1, 1, 1...
## $ V9 <int> 5, 7, 13, 25, 6, 6, 13, 9, 6, 5, 17, 12, 21, 18, 1, 8, 9, ...
## $ V10 <int> 5, 7, 8, 7, 7, 7, 5, 7, 5, 6, 5, 5, 8, 5, 8, 6, 6, 7, 7, 5...
## $ V11 <int> 0, 0, 207, 47, 33, 74, 0, 54, 0, 168, 211, 0, 50, 0, 20, 3...
## $ V12 <int> 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5...
## $ V13 <int> 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 6, 2, 2, 6, 2, 2, 2, 2, 2, 6...
## $ V14 <int> 10708, 11500, 15865, 17542, 10652, 9135, 7390, 8795, 8385,...
## $ V15 <int> 0, 1037, 0, 651, 0, 0, 0, 1276, 0, 1111, 0, 0, 0, 0, 0, 0,...
## $ V16 <int> 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0...
## $ V17 <int> 476, 0, 81, 220, 160, 192, 0, 224, 210, 133, 0, 0, 322, 0,...
## $ V18 <int> 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1...
## $ V19 <int> 14, 4, 14, 14, 13, 13, 14, 13, 7, 7, 10, 7, 13, 15, 13, 7,...
## $ V20 <int> 1867, 2057, 2217, 2167, 1494, 1536, 1098, 2256, 985, 2087,...
## $ V21 <int> 5, 7, 6, 7, 5, 5, 7, 5, 8, 8, 5, 5, 5, 6, 5, 7, 5, 6, 6, 5...
## $ V22 <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3...
## $ V23 <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ V24 <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...
## $ V25 <int> 142, 322, 224, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 144, 0, 0,...
## $ V26 <int> 3, 3, 5, 3, 3, NA, NA, 5, NA, NA, NA, NA, 3, 3, 5, 2, 5, 5...
## $ V27 <int> 3, 3, 3, 3, 4, 2, 2, 3, 2, 2, 2, 3, 4, 3, 2, 2, 3, 2, 3, 3...
## $ V28 <int> 2, NA, NA, 3, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ V29 <int> 2, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, 2...
## $ V30 <int> 2, 6, 5, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 6...
## $ V31 <int> 3, 1, 1, 2, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 2, 1, 1, 1...
## $ V32 <int> 1955, 1936, 1970, 1974, 2006, 2002, 1955, 2000, 1977, 1966...
## $ V33 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4...
## $ V34 <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ V35 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ V36 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ V37 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ V38 <int> 1, 1, 3, 2, 2, 2, 3, 1, 3, 1, 3, 2, 2, 3, 1, 2, 1, 3, 2, 3...
## $ V39 <int> 1617, 1017, 2217, 1192, 1494, 1536, 1098, 952, 985, 832, 1...
## $ V40 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ V41 <int> 4, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 1, 4, 3, 4, 3, 4, 3, 4...
## $ V42 <int> NA, 86, 95, NA, 91, 70, NA, NA, 65, NA, 87, 100, 85, 56, 4...
## $ V43 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4...
## $ V44 <int> 2, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ V45 <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 5, 5, 1, 5, 5, 2, 3...
## $ V46 <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 9, 1, 3, 3, 3, 3, 5, 2...
## $ V47 <int> 379, 223, 351, 125, 0, 810, 902, 300, 595, 568, 0, 0, 1220...
## $ V48 <int> 20, 50, 20, 60, 20, 20, 20, 60, 80, 60, 90, 20, 20, 30, 12...
## $ V49 <int> 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 4, 4, 3, 4, 3, 4, 4, 4, 3, 4...
## $ V50 <dbl> 1.436426e+05, 4.972889e+04, 1.232003e+05, 1.562537e+04, -6...
## $ V51 <int> 3, 1, 3, 6, 3, 3, 3, 6, 8, 6, 3, 3, 3, 3, 3, 3, 6, 3, 6, 3...
## $ V52 <int> 4, 2, 1, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 4, 2, 4, 2, 2...
## $ V53 <int> 4, 4, 2, 2, 1, 1, 4, 4, 1, 2, 4, 4, 3, 4, 4, 4, 2, 4, 1, 4...
## $ V54 <int> 470, 794, 1043, 36, 1494, 726, 196, 652, 390, 264, 1656, 9...
## $ V55 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ V56 <int> 15, 4, 15, 15, 14, 14, 15, 14, 7, 7, 11, 7, 14, 16, 14, 7,...
## $ V57 <int> 1, 3, 1, 5, 1, 1, 5, 1, 5, 3, 5, 1, 1, 3, 1, 5, 3, 2, 1, 5...
## $ V58 <int> 1, 1, 2, 2, 3, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1...
## $ V59 <int> 1993, 1987, 1970, 2003, 2007, 2003, 1955, 2000, 1977, 2007...
## $ V60 <int> 1, 1, 1, 1, 1, 4, 1, 1, 4, 1, 1, 4, 1, 4, 4, 1, 1, 4, 2, 4...
## $ V61 <int> 49, 36, 64, 81, 49, 49, 36, 64, 36, 81, 64, 16, 64, 25, 36...
## $ V62 <int> 1867, 1020, 2217, 1516, 1494, 1536, 1098, 980, 985, 976, 1...
## $ V63 <int> 7, 6, 8, 9, 7, 7, 6, 8, 6, 9, 8, 4, 8, 5, 6, 6, 7, 6, 6, 5...
## $ V64 <int> 4, 5, 1, 4, 6, 3, 1, 3, 3, 4, 6, 6, 3, 6, 3, 2, 3, 1, 3, 1...
## $ V65 <int> 3, 2, 3, 3, 1, 2, 2, 3, 1, 1, 2, 3, 2, 1, 1, 1, 1, 3, 3, 2...
## $ V66 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ V67 <int> 4, 3, 3, 3, 3, 3, 4, 3, 4, 3, 4, 4, 3, 4, 3, 4, 4, 4, 3, 4...
## $ V68 <int> 768, 0, 823, 1031, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V69 <int> 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0...
## $ V70 <int> 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5...
## $ V71 <dbl> 114661264, 132250000, 251698225, 307721764, 113465104, 834...
## $ V72 <int> 0, 0, 0, 0, 306, 113, 151, 0, 220, 157, 564, 0, 420, 0, 16...
## $ V73 <int> 1, 9, 9, 9, 7, 9, 9, 9, 9, 9, 9, 9, 7, 9, 9, 9, 9, 9, 5, 9...
## $ V74 <int> 11, 6, 10, 7, 8, 12, 7, 4, 11, 7, 11, 2, 5, 3, 4, 5, 7, 9,...
## $ V75 <int> 2009, 2006, 2007, 2007, 2007, 2008, 2008, 2009, 2008, 2007...
## $ V76 <int> 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5...
## $ V77 <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ V78 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 130, 0, 0, 0, 0, 0,...
## $ V79 <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ V80 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ V81 <int> 1955, 1936, 1970, 1974, 2006, 2002, 1955, 2000, 1977, 1966...
## $ V82 <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1...
## $ V83 <int> 4, 4, 2, 4, 4, 4, 4, 4, 2, 1, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4...
# Per-column summary statistics (including NA counts) of the predictors.
summary(XTrain)
## V1 V2 V3 V4
## Min. :0.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:5.000 1st Qu.:5.000 1st Qu.:1.000
## Median :3.000 Median :5.000 Median :5.000 Median :1.000
## Mean :2.866 Mean :4.689 Mean :4.904 Mean :1.476
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:2.000
## Max. :8.000 Max. :5.000 Max. :5.000 Max. :2.000
## NA's :1 NA's :72 NA's :1236
## V5 V6 V7 V8
## Min. : 0.0 Min. : 0.0 Min. :0.000 Min. :1.000
## 1st Qu.: 326.2 1st Qu.: 0.0 1st Qu.:1.000 1st Qu.:1.000
## Median : 476.5 Median : 0.0 Median :1.000 Median :1.000
## Mean : 470.9 Mean : 41.5 Mean :1.045 Mean :1.478
## 3rd Qu.: 576.0 3rd Qu.: 0.0 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :1418.0 Max. :15500.0 Max. :3.000 Max. :5.000
##
## V9 V10 V11 V12
## Min. : 1.00 Min. : 1.000 Min. : 0.0 Min. :1.000
## 1st Qu.: 8.00 1st Qu.: 5.000 1st Qu.: 0.0 1st Qu.:5.000
## Median :13.00 Median : 6.000 Median : 24.0 Median :5.000
## Mean :13.16 Mean : 6.111 Mean : 45.9 Mean :4.874
## 3rd Qu.:18.00 3rd Qu.: 7.000 3rd Qu.: 66.0 3rd Qu.:5.000
## Max. :25.00 Max. :10.000 Max. :547.0 Max. :5.000
## NA's :72
## V13 V14 V15 V16
## Min. :1.000 Min. : 1300 Min. : 0.0 Min. :0.0000
## 1st Qu.:2.000 1st Qu.: 7558 1st Qu.: 0.0 1st Qu.:0.0000
## Median :2.000 Median : 9502 Median : 0.0 Median :0.0000
## Mean :3.268 Mean : 10549 Mean : 350.6 Mean :0.3832
## 3rd Qu.:6.000 3rd Qu.: 11645 3rd Qu.: 731.5 3rd Qu.:1.0000
## Max. :6.000 Max. :215245 Max. :2065.0 Max. :2.0000
## NA's :72
## V17 V18 V19 V20
## Min. : 0.0 Min. :0.000 Min. : 1.00 Min. : 334
## 1st Qu.: 0.0 1st Qu.:1.000 1st Qu.: 9.00 1st Qu.:1132
## Median : 0.0 Median :2.000 Median :13.00 Median :1470
## Mean : 93.7 Mean :1.565 Mean :10.65 Mean :1520
## 3rd Qu.:168.0 3rd Qu.:2.000 3rd Qu.:13.00 3rd Qu.:1778
## Max. :857.0 Max. :3.000 Max. :15.00 Max. :5642
##
## V21 V22 V23 V24
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:5.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:7.000
## Median :5.000 Median :3.000 Median :2.000 Median :7.000
## Mean :5.596 Mean :2.857 Mean :1.939 Mean :6.751
## 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:7.000
## Max. :9.000 Max. :3.000 Max. :2.000 Max. :7.000
##
## V25 V26 V27 V28
## Min. : 0.00 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.: 0.00 1st Qu.:3.000 1st Qu.:2.00 1st Qu.:2.000
## Median : 0.00 Median :3.000 Median :3.00 Median :3.000
## Mean : 15.12 Mean :3.737 Mean :2.76 Mean :2.414
## 3rd Qu.: 0.00 3rd Qu.:5.000 3rd Qu.:3.00 3rd Qu.:3.000
## Max. :480.00 Max. :5.000 Max. :4.00 Max. :4.000
## NA's :622 NA's :8 NA's :1069
## V29 V30 V31 V32
## Min. :1.000 Min. :1.000 Min. :0.0000 Min. :1900
## 1st Qu.:2.000 1st Qu.:6.000 1st Qu.:0.0000 1st Qu.:1961
## Median :2.000 Median :6.000 Median :1.0000 Median :1980
## Mean :2.392 Mean :5.715 Mean :0.6168 Mean :1978
## 3rd Qu.:3.000 3rd Qu.:6.000 3rd Qu.:1.0000 3rd Qu.:2002
## Max. :6.000 Max. :6.000 Max. :3.0000 Max. :2010
## NA's :31 NA's :72
## V33 V34 V35 V36
## Min. :1.000 Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:4.000 1st Qu.:2.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median :4.000 Median :2.000 Median : 0.000 Median : 0.000
## Mean :3.775 Mean :1.996 Mean : 2.496 Mean : 5.724
## 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :4.000 Max. :2.000 Max. :648.000 Max. :572.000
##
## V37 V38 V39 V40 V41
## Min. :1 Min. :1.00 Min. : 0 Min. :1.000 Min. :1.000
## 1st Qu.:1 1st Qu.:1.00 1st Qu.: 799 1st Qu.:3.000 1st Qu.:3.000
## Median :1 Median :2.00 Median : 992 Median :3.000 Median :3.000
## Mean :1 Mean :2.18 Mean :1061 Mean :2.884 Mean :3.269
## 3rd Qu.:1 3rd Qu.:3.00 3rd Qu.:1292 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :1 Max. :3.00 Max. :6110 Max. :4.000 Max. :4.000
## NA's :72 NA's :1275 NA's :30
## V42 V43 V44 V45
## Min. : 21.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 60.00 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:3.000
## Median : 70.00 Median :4.000 Median :2.000 Median :5.000
## Mean : 70.34 Mean :4.029 Mean :2.077 Mean :4.017
## 3rd Qu.: 80.00 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:5.000
## Max. :313.00 Max. :5.000 Max. :8.000 Max. :5.000
## NA's :232
## V46 V47 V48 V49
## Min. :1.000 Min. : 0.0 Min. : 20.0 Min. :1.000
## 1st Qu.:3.000 1st Qu.: 0.0 1st Qu.: 20.0 1st Qu.:3.000
## Median :3.000 Median : 384.5 Median : 50.0 Median :4.000
## Mean :3.027 Mean : 444.7 Mean : 56.3 Mean :3.537
## 3rd Qu.:3.000 3rd Qu.: 706.0 3rd Qu.: 70.0 3rd Qu.:4.000
## Max. :9.000 Max. :5644.0 Max. :190.0 Max. :4.000
##
## V50 V51 V52 V53
## Min. : -3 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 1 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:3.000
## Median : 147840 Median :3.000 Median :2.000 Median :4.000
## Mean : 404778 Mean :4.025 Mean :2.429 Mean :3.277
## 3rd Qu.: 498436 3rd Qu.:6.000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :31854738 Max. :8.000 Max. :6.000 Max. :4.000
## NA's :31
## V54 V55 V56 V57
## Min. : 0.0 Min. :1.00 Min. : 1.00 Min. :1.000
## 1st Qu.: 237.2 1st Qu.:1.25 1st Qu.: 9.00 1st Qu.:1.000
## Median : 484.0 Median :2.00 Median :14.00 Median :1.000
## Mean : 570.0 Mean :2.00 Mean :11.37 Mean :2.524
## 3rd Qu.: 808.0 3rd Qu.:2.75 3rd Qu.:14.00 3rd Qu.:5.000
## Max. :2336.0 Max. :3.00 Max. :16.00 Max. :5.000
## NA's :1312
## V58 V59 V60 V61
## Min. :0.000 Min. :1950 Min. :1.000 Min. : 4.00
## 1st Qu.:1.000 1st Qu.:1966 1st Qu.:1.000 1st Qu.: 25.00
## Median :2.000 Median :1994 Median :4.000 Median : 36.00
## Mean :1.763 Mean :1985 Mean :2.936 Mean : 45.29
## 3rd Qu.:2.000 3rd Qu.:2004 3rd Qu.:4.000 3rd Qu.: 49.00
## Max. :4.000 Max. :2010 Max. :4.000 Max. :196.00
##
## V62 V63 V64 V65
## Min. : 334.0 Min. : 2.000 Min. :1.000 Min. :1.000
## 1st Qu.: 882.5 1st Qu.: 5.000 1st Qu.:2.000 1st Qu.:1.000
## Median :1088.0 Median : 6.000 Median :3.000 Median :2.000
## Mean :1163.5 Mean : 6.527 Mean :3.722 Mean :1.996
## 3rd Qu.:1389.8 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:3.000
## Max. :4692.0 Max. :14.000 Max. :6.000 Max. :3.000
## NA's :30
## V66 V67 V68 V69
## Min. :0.0000 Min. :1.000 Min. : 0.00 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.: 0.00 1st Qu.:0.0000
## Median :0.0000 Median :4.000 Median : 0.00 Median :0.0000
## Mean :0.0569 Mean :3.338 Mean : 46.03 Mean :0.4256
## 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.: 0.00 3rd Qu.:1.0000
## Max. :2.0000 Max. :4.000 Max. :1474.00 Max. :3.0000
##
## V70 V71 V72 V73
## Min. :1.000 Min. :1.690e+06 Min. : 0.0 Min. :1.000
## 1st Qu.:5.000 1st Qu.:5.713e+07 1st Qu.: 0.0 1st Qu.:9.000
## Median :5.000 Median :9.028e+07 Median : 0.0 Median :9.000
## Mean :4.727 Mean :2.117e+08 Mean : 104.0 Mean :8.517
## 3rd Qu.:5.000 3rd Qu.:1.356e+08 3rd Qu.: 165.8 3rd Qu.:9.000
## Max. :5.000 Max. :4.633e+10 Max. :1600.0 Max. :9.000
## NA's :8
## V74 V75 V76 V77
## Min. : 1.000 Min. :2006 Min. :1.000 Min. :1.000
## 1st Qu.: 5.000 1st Qu.:2007 1st Qu.:5.000 1st Qu.:2.000
## Median : 6.000 Median :2008 Median :5.000 Median :2.000
## Mean : 6.296 Mean :2008 Mean :4.782 Mean :2.037
## 3rd Qu.: 8.000 3rd Qu.:2009 3rd Qu.:5.000 3rd Qu.:2.000
## Max. :12.000 Max. :2010 Max. :6.000 Max. :6.000
##
## V78 V79 V80 V81
## Min. : 0.00 Min. :1.00 Min. : 0.000 Min. :1872
## 1st Qu.: 0.00 1st Qu.:3.00 1st Qu.: 0.000 1st Qu.:1954
## Median : 0.00 Median :3.00 Median : 0.000 Median :1972
## Mean : 22.44 Mean :3.01 Mean : 3.628 Mean :1971
## 3rd Qu.: 0.00 3rd Qu.:3.00 3rd Qu.: 0.000 3rd Qu.:2000
## Max. :552.00 Max. :8.00 Max. :508.000 Max. :2010
##
## V82 V83
## Min. :1.00 Min. :1.000
## 1st Qu.:1.00 1st Qu.:4.000
## Median :1.00 Median :4.000
## Mean :1.06 Mean :3.805
## 3rd Qu.:1.00 3rd Qu.:4.000
## Max. :3.00 Max. :4.000
## NA's :30
# Summary statistics of the target variable.
summary(YTrain)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 130000 163000 181300 214000 755000
# Histogram of the target.
plotly::plot_ly(x = YTrain) %>%
  plotly::add_histogram()

# Box plot of the target with every observation drawn next to the box.
plotly::plot_ly(y = YTrain, type = "box", boxpoints = "all", jitter = 0.5,
                pointpos = -1.8)

# Sorted target values against their rank.
# sort() drops NA values, so the x positions are derived from the sorted
# vector itself; this keeps x and y the same length and avoids the
# `1:length(x)` pitfall on empty input.
sortedTarget <- sort(YTrain)
plotly::plot_ly(x = seq_along(sortedTarget), y = sortedTarget) %>%
  plotly::add_lines() %>%
  plotly::add_markers()
# Number of missing values per training column, most-missing first; columns
# without any missing value are dropped.
trainMissingData <- XTrain %>%
  dplyr::mutate_all(is.na) %>%
  dplyr::summarise_all(sum) %>%
  tidyr::gather(Var, NaCountTrain) %>%
  dplyr::arrange(dplyr::desc(NaCountTrain)) %>%
  dplyr::filter(NaCountTrain > 0)

# Same computation for the test set.
testMissingData <- XTest %>%
  dplyr::mutate_all(is.na) %>%
  dplyr::summarise_all(sum) %>%
  tidyr::gather(Var, NaCountTest) %>%
  dplyr::arrange(dplyr::desc(NaCountTest)) %>%
  dplyr::filter(NaCountTest > 0)

nTrainningObs <- nrow(XTrain)
nTestObs <- nrow(XTest)

# Combine both tables. A variable with missing values in only one of the two
# sets has no row in the other table, so the full join leaves its count as NA;
# that count is really zero, so it is filled in before computing percentages.
# Percentages are rounded after scaling to 100 to avoid floating-point
# artefacts from multiplying an already rounded fraction.
missingResultTable <- trainMissingData %>%
  dplyr::full_join(testMissingData, by = 'Var') %>%
  dplyr::mutate(NaCountTrain = dplyr::coalesce(NaCountTrain, 0L),
                NaCountTest = dplyr::coalesce(NaCountTest, 0L),
                NaCountTrainPerc = round(100 * NaCountTrain / nTrainningObs, 2),
                NaCountTestPerc = round(100 * NaCountTest / nTestObs, 2))
missingResultTable
## # A tibble: 19 x 5
## Var NaCountTrain NaCountTest NaCountTrainPerc NaCountTestPerc
## <chr> <int> <int> <dbl> <dbl>
## 1 V55 1312 1310 99.54 99.47
## 2 V40 1275 1268 96.74 96.28
## 3 V4 1236 1234 93.78 93.70
## 4 V28 1069 1069 81.11 81.17
## 5 V26 622 617 47.19 46.85
## 6 V42 232 235 17.60 17.84
## 7 V3 72 75 5.46 5.69
## 8 V12 72 75 5.46 5.69
## 9 V13 72 75 5.46 5.69
## 10 V32 72 75 5.46 5.69
## 11 V38 72 75 5.46 5.69
## 12 V30 31 38 2.35 2.89
## 13 V53 31 38 2.35 2.89
## 14 V41 30 37 2.28 2.81
## 15 V64 30 37 2.28 2.81
## 16 V83 30 37 2.28 2.81
## 17 V27 8 8 0.61 0.61
## 18 V72 8 8 0.61 0.61
## 19 V2 1 1 0.08 0.08
# Names of every variable that has at least one missing value.
missingDataColNames <- missingResultTable$Var

# Missingness map of those variables in the training set.
data.frame(dplyr::select(XTrain, dplyr::one_of(missingDataColNames))) %>%
  Amelia::missmap(col = c("black", "grey"))

# Missingness map of the same variables in the test set.
data.frame(dplyr::select(XTest, dplyr::one_of(missingDataColNames))) %>%
  Amelia::missmap(col = c("black", "grey"))
# Restrict the detailed look to (at most) the first six rows of the missing
# value table. head() avoids the NA names that `[1:6]` would produce when the
# table has fewer than six rows.
missingDataColNames <- head(missingResultTable$Var, 6)

# Build the base ggplot of the observed (non-missing) values of `Vars` in
# `Data`, standardized per variable to zero mean and unit standard deviation.
#
# Args:
#   Data: data frame / tibble holding the columns named in `Vars`.
#   Vars: character vector of column names to plot.
#
# Returns:
#   A ggplot object without layers; add a geom to draw it.
buildStandardizedPlot <- function(Data, Vars) {
  Data %>%
    dplyr::select(dplyr::one_of(Vars)) %>%
    tidyr::gather(Var, Val) %>%
    dplyr::filter(!is.na(Val)) %>%
    dplyr::group_by(Var) %>%
    dplyr::mutate(Val = (Val - mean(Val)) / sd(Val)) %>%
    dplyr::ungroup() %>%
    ggplot2::ggplot(ggplot2::aes(x = Var, fill = Var, color = Var, y = Val))
}

# Training set: box plot and raw points.
p <- buildStandardizedPlot(XTrain, missingDataColNames)
plotly::ggplotly(p + ggplot2::geom_boxplot())
plotly::ggplotly(p + ggplot2::geom_point())

# Test set: same two views.
p <- buildStandardizedPlot(XTest, missingDataColNames)
plotly::ggplotly(p + ggplot2::geom_boxplot())
plotly::ggplotly(p + ggplot2::geom_point())
# Share of missing training values (in percent) at or above which a variable's
# NAs are replaced by zero; below it the variable is KNN-imputed.
NAThresholdPerc <- 20.0

# Variables with a large share of missing training values.
# `>=` is used so a variable sitting exactly on the threshold is still
# assigned to one of the two sets.
NAToZeroVars <- missingResultTable %>%
  dplyr::filter(NaCountTrainPerc >= NAThresholdPerc) %>%
  dplyr::select(Var) %>%
  unlist() %>% as.character()

# Variables with a small share of missing training values. A variable that
# only has missing values in the test set may carry an NA training percentage
# after the full join; it is routed here so it is imputed as well.
NNImputationVars <- missingResultTable %>%
  dplyr::filter(is.na(NaCountTrainPerc) | NaCountTrainPerc < NAThresholdPerc) %>%
  dplyr::select(Var) %>%
  unlist() %>% as.character()
# Impute the missing values of `Data` with two strategies.
#
# Args:
#   Data: data frame / tibble of predictors.
#   NAToZeroVars: character vector of column names whose NAs are replaced by 0.
#   NNImputationVars: character vector of column names that are imputed
#     together with DMwR::knnImputation().
#   k: number of neighbours passed to DMwR::knnImputation(). Defaults to 10,
#     the value that used to be hard-coded.
#
# Returns:
#   A data.frame holding the untouched columns first, then the zero-filled
#   columns, then the KNN-imputed columns.
preProcessNAs <- function(Data, NAToZeroVars, NNImputationVars, k = 10) {
  # Columns that need no treatment. Base `[` indexing replaces the deprecated
  # dplyr::select_() and still fails loudly on unknown column names.
  auxColNames <- base::setdiff(names(Data), c(NAToZeroVars, NNImputationVars))
  auxData <- Data[, auxColNames, drop = FALSE]

  # Zero-fill; a plain function replaces the deprecated dplyr::funs().
  NAToZeroData <- Data[, NAToZeroVars, drop = FALSE]
  if (length(NAToZeroVars) > 0) {
    NAToZeroData <- NAToZeroData %>%
      dplyr::mutate_all(function(x) ifelse(is.na(x), 0, x))
  }

  # KNN imputation works on a plain data.frame and is skipped when there is
  # nothing to impute, because it cannot handle a zero-column input.
  NNImputationData <- data.frame(Data[, NNImputationVars, drop = FALSE])
  if (length(NNImputationVars) > 0) {
    NNImputationData <- DMwR::knnImputation(NNImputationData, k = k)
  }

  # Only bind the non-empty parts back together.
  parts <- Filter(function(part) ncol(part) > 0,
                  list(auxData, NAToZeroData, NNImputationData))
  data.frame(dplyr::bind_cols(parts))
}
# Apply the same missing-value treatment to the training and the test set.
XTrain <- preProcessNAs(XTrain,
                        NAToZeroVars = NAToZeroVars,
                        NNImputationVars = NNImputationVars)
XTest <- preProcessNAs(XTest,
                       NAToZeroVars = NAToZeroVars,
                       NNImputationVars = NNImputationVars)
# Scatter plot of every variable after rescaling it to [0, 1] with its own
# minimum and maximum. A variable whose minimum equals its maximum yields
# 0 / 0 here, i.e. NaN values that ggplot2 cannot draw.
p <- ggplot2::ggplot(
  data = XTrain %>%
    tidyr::gather(key = Var, value = Val) %>%
    dplyr::group_by(Var) %>%
    dplyr::mutate(Val = (Val - min(Val)) / (max(Val) - min(Val))) %>%
    dplyr::ungroup() %>%
    data.frame(),
  mapping = ggplot2::aes(x = Var, y = Val)
) +
  ggplot2::geom_point()
p
## Warning: Removed 1318 rows containing missing values (geom_point).
# Number of distinct values per variable, fewest first: reshape to long
# format, keep each (variable, value) pair once, then count rows per variable.
XTrain %>%
  tidyr::gather(key = Var, value = Val) %>%
  unique() %>%
  dplyr::count(Var) %>%
  dplyr::rename(UniqueValuesCount = n) %>%
  dplyr::arrange(UniqueValuesCount) %>%
  data.frame()
## Var UniqueValuesCount
## 1 V37 1
## 2 V23 2
## 3 V34 2
## 4 V16 3
## 5 V22 3
## 6 V4 3
## 7 V65 3
## 8 V66 3
## 9 V82 3
## 10 V18 4
## 11 V31 4
## 12 V33 4
## 13 V49 4
## 14 V55 4
## 15 V60 4
## 16 V67 4
## 17 V69 4
## 18 V7 4
## 19 V2 5
## 20 V28 5
## 21 V40 5
## 22 V43 5
## 23 V45 5
## 24 V57 5
## 25 V58 5
## 26 V70 5
## 27 V75 5
## 28 V8 5
## 29 V26 6
## 30 V29 6
## 31 V52 6
## 32 V76 6
## 33 V77 6
## 34 V24 7
## 35 V35 7
## 36 V1 8
## 37 V44 8
## 38 V51 8
## 39 V79 8
## 40 V21 9
## 41 V46 9
## 42 V73 9
## 43 V10 10
## 44 V27 12
## 45 V61 12
## 46 V63 12
## 47 V74 12
## 48 V19 15
## 49 V48 15
## 50 V56 16
## 51 V80 19
## 52 V6 21
## 53 V36 22
## 54 V9 25
## 55 V41 27
## 56 V83 28
## 57 V30 31
## 58 V53 31
## 59 V64 34
## 60 V3 36
## 61 V12 42
## 62 V38 53
## 63 V59 61
## 64 V13 66
## 65 V25 70
## 66 V81 111
## 67 V78 114
## 68 V68 129
## 69 V32 165
## 70 V11 188
## 71 V17 260
## 72 V72 320
## 73 V42 336
## 74 V15 393
## 75 V5 428
## 76 V47 593
## 77 V39 684
## 78 V62 718
## 79 V54 740
## 80 V20 806
## 81 V14 978
## 82 V71 978
## 83 V50 1318
# V37 has a single distinct value in the training data, so it is removed from
# both data sets.
XTrain <- dplyr::select(XTrain, -V37)
XTest <- dplyr::select(XTest, -V37)
# Horizontal box plots of every remaining variable after rescaling it to
# [0, 1] with its own minimum and maximum.
p <- ggplot2::ggplot(
  data = XTrain %>%
    tidyr::gather(key = Var, value = Val) %>%
    dplyr::group_by(Var) %>%
    dplyr::mutate(Val = (Val - min(Val)) / (max(Val) - min(Val))) %>%
    dplyr::ungroup() %>%
    data.frame(),
  mapping = ggplot2::aes(x = Var, y = Val)
) +
  ggplot2::geom_boxplot() +
  ggplot2::coord_flip()
p
# Min-max rescale every column of `Data`.
#
# Args:
#   Data: data frame of numeric columns to rescale.
#   Reference: data frame that supplies each column's minimum and maximum; it
#     must contain every column of `Data`. Defaults to `Data` itself, so a
#     single-argument call still rescales each column with its own range.
#
# Returns:
#   `Data` with each column replaced by (x - min) / (max - min), where min and
#   max are taken from the matching `Reference` column. A column whose
#   reference range is zero is mapped to 0 instead of NaN (0 / 0).
preProcessScaleAndCenter <- function(Data, Reference = Data) {
  missingCols <- base::setdiff(names(Data), names(Reference))
  if (length(missingCols) > 0) {
    stop("Reference is missing columns: ",
         paste(missingCols, collapse = ", "), call. = FALSE)
  }
  if (ncol(Data) == 0) {
    return(Data)
  }

  rescaleColumn <- function(Values, RefValues) {
    lower <- min(RefValues)
    span <- max(RefValues) - lower
    # Constant reference column: avoid dividing by zero.
    if (!is.na(span) && span == 0) {
      return(Values * 0)
    }
    (Values - lower) / span
  }

  # `Data[] <-` keeps the class and column names of the input.
  Data[] <- Map(rescaleColumn, Data, Reference[names(Data)])
  Data
}

# The test set is rescaled with the ranges of the training set so that both
# sets go through the same transformation before the PCA fitted on the
# training data is applied. XTest must be processed first, while XTrain still
# holds the unscaled values.
XTest <- XTest %>% preProcessScaleAndCenter(Reference = XTrain)
XTrain <- XTrain %>% preProcessScaleAndCenter()
# Fit a principal component analysis on the training predictors.
PCAModel <- prcomp(x = data.matrix(XTrain))

# One row per component with its standard deviation, proportion of variance
# and cumulative proportion. `Component` becomes a factor whose levels follow
# the row order, so the bars are drawn in that order.
auxPCAData <- data.frame(t(summary(PCAModel)$importance)) %>%
  tibble::rownames_to_column(var = "Component") %>%
  dplyr::mutate(Component = factor(x = Component,
                                   levels = as.character(Component)))

# Standard deviation per component.
p <- ggplot2::ggplot(auxPCAData,
                     ggplot2::aes(x = Component, y = Standard.deviation)) +
  ggplot2::geom_bar(stat = 'identity')
plotly::ggplotly(p = p)

# Proportion of variance per component.
p <- ggplot2::ggplot(auxPCAData,
                     ggplot2::aes(x = Component, y = Proportion.of.Variance)) +
  ggplot2::geom_bar(stat = 'identity')
plotly::ggplotly(p = p)

# Cumulative proportion of variance per component.
p <- ggplot2::ggplot(auxPCAData,
                     ggplot2::aes(x = Component, y = Cumulative.Proportion)) +
  ggplot2::geom_bar(stat = 'identity')
plotly::ggplotly(p = p)
# Project `Data` onto the leading principal components of a fitted PCA.
#
# Args:
#   Data: data frame (or matrix) holding the variables `PCAModel` was fitted on.
#   PCAModel: fitted model accepted by predict(), e.g. the result of prcomp().
#   nComponents: number of leading components to keep; a single value between
#     1 and the number of components returned by predict().
#
# Returns:
#   A data.frame with one row per row of `Data` and the first `nComponents`
#   component scores as columns.
#
# Raises:
#   An error when `nComponents` is not a single value in the valid range.
preProcessPCA <- function(Data, PCAModel, nComponents) {
  scores <- data.frame(predict(object = PCAModel, newdata = Data))
  validCount <- is.numeric(nComponents) &&
    length(nComponents) == 1 &&
    !is.na(nComponents) &&
    nComponents >= 1 &&
    nComponents <= ncol(scores)
  if (!validCount) {
    stop("nComponents must be a single number between 1 and ", ncol(scores),
         call. = FALSE)
  }
  # seq_len() instead of `1:nComponents`; drop = FALSE keeps a data.frame even
  # when a single component is requested.
  scores[, seq_len(nComponents), drop = FALSE]
}
# Number of leading principal components kept for both data sets; defined once
# so the training and test projections cannot drift apart.
nPCAComponents <- 45
XTrain <- XTrain %>% preProcessPCA(PCAModel = PCAModel, nComponents = nPCAComponents)
XTest <- XTest %>% preProcessPCA(PCAModel = PCAModel, nComponents = nPCAComponents)

# Persist the processed predictors and the training labels. The destination is
# passed positionally because newer readr versions deprecate the `path`
# argument name in favour of `file`; the positional form works with both.
readr::write_csv(XTrain, 'data/processedTrainData.csv')
readr::write_csv(XTest, 'data/processedTestData.csv')
# NOTE(review): piping the bare vector into data.frame() names the single
# column "."; kept as-is so the written header does not change.
readr::write_csv(YTrain %>% data.frame(), 'data/TrainLabels.csv')